import geopandas as gpdimport matplotlib as pltimport pandas as pdimport seaborn as snsimport pandas_profilingimport mplleaflet as leafletfrom matplotlib.ticker import PercentFormatterimport matplotlib.pyplot as pltimport random as rnd%matplotlib inlinex
# Load the survey layer produced upstream.
all_data = gpd.read_file('outputs/newvars_surveys.geojson')

# Dropping variables: the analysis below only needs the attribute table, so
# the geometry and the "index_right" column are removed before the layer is
# converted to a plain pandas DataFrame.
for unwanted_column in ("geometry", "index_right"):
    all_data = all_data.drop(columns=unwanted_column)
all_data = pd.DataFrame(all_data)

# Column groups by dtype: categorical (object/bool) and numeric (int64/float64).
# Both are captured here, so columns added later are not part of either group.
catvars = all_data.select_dtypes(include=['object', 'bool']).columns
numvars = all_data.select_dtypes(include=['int64', 'float64']).columns
# Dropping outliers
# Row labels breaching each per-feature cap. These index objects are kept as
# module-level names in case later cells inspect them.
merma_drop = all_data[all_data["cant_merma"] > 30].index
num_entregas_drop = all_data[all_data["num_entregas_fv"] > 15].index
carga_drop = all_data[all_data["cant_carga"] > 350].index

# Remove every row that breaches at least one cap in a single pass.
# Picking outliers by hard-coded position (merma_drop[0], merma_drop[1],
# num_entregas_drop[0]) raises IndexError when fewer rows match, silently
# keeps the extra rows when more match, and raises KeyError when a label was
# already removed by an earlier drop. A combined boolean mask has none of
# these problems.
outlier_mask = (
    (all_data["cant_merma"] > 30)
    | (all_data["num_entregas_fv"] > 15)
    | (all_data["cant_carga"] > 350)
    | (all_data["cant_sku"] > 40)
)
all_data.drop(all_data.index[outlier_mask], inplace=True)

# Sanity check (notebook display): no row should remain above the load cap.
all_data[all_data["cant_carga"] > 350]
# Changing missing values (recorded as 0) to the mean of the feature.
# Each mean is taken over the strictly positive entries only, so the zero
# placeholders do not pull it down.
mean_cant_carga = all_data.loc[all_data["cant_carga"] > 0, "cant_carga"].mean()
mean_cant_merma = all_data.loc[all_data["cant_merma"] > 0, "cant_merma"].mean()
mean_cant_sku = all_data.loc[all_data["cant_sku"] > 0, "cant_sku"].mean()

# For each feature ONE jitter value is drawn from uniform(-std, +std) and
# added to the mean; every zero row of that feature receives the same
# replacement value. The standard deviation is taken from the column as it
# stands at that moment (zeros still included).
# `.std()` replaces `describe()[2]`: an integer key on the string-labelled
# describe() result relies on a deprecated positional fallback.
# NOTE(review): if any of these columns is integer-typed, assigning a float
# here changes its dtype — confirm this is intended.
for column, positive_mean in (
    ("cant_carga", mean_cant_carga),
    ("cant_merma", mean_cant_merma),
    ("cant_sku", mean_cant_sku),
):
    spread = all_data[column].std()
    all_data.loc[all_data[column] == 0, column] = positive_mean + rnd.uniform(-spread, spread)

# Leftover notebook expression: draws one more jitter value for inspection;
# the result is discarded.
rnd.uniform(-all_data["cant_carga"].std(), all_data["cant_carga"].std())

# Group by districts for zone
north_lima = ["CARABAYLLO", "COMAS", "INDEPENDENCIA", "LOS OLIVOS", "PUENTE PIEDRA",
              "SAN MARTIN DE PORRES", "ANCON", "SANTA ROSA"]
south_lima = ["CHORRILLOS", "LURIN", "PACHACAMAC", "VILLA EL SALVADOR"]
east_lima = ["SAN JUAN DE LURIGANCHO", "ATE", "LA MOLINA"]
# NOTE(review): "SAN MIGUEL" is listed twice, and "CHORRILLOS" also appears in
# south_lima; because south_lima is checked first, CHORRILLOS is always
# labelled "south_lima". Confirm which zone it belongs to.
center_lima = ["SURQUILLO", "SAN MIGUEL", "BARRANCO", "SANTIAGO DE SURCO", "SAN MIGUEL",
               "CHORRILLOS", "PUEBLO LIBRE", "LINCE", "MAGDALENA DEL MAR"]

# Lookup order matters: the first list containing the district wins.
_ZONE_LISTS = (
    ("north_lima", north_lima),
    ("south_lima", south_lima),
    ("east_lima", east_lima),
    ("center_lima", center_lima),
)


def zona(x):
    """Return the zone label for district name *x*.

    The district lists are checked in the order north, south, east, center,
    and the label of the first list containing *x* is returned. A value that
    is in none of the lists yields ``None``.
    """
    for zone_label, districts in _ZONE_LISTS:
        if x in districts:
            return zone_label
    return None


all_data["zone"] = all_data["distrito"].apply(zona)

# Pandas profiling
# Build the profiling report for the cleaned table (rendered as the cell
# output when run in a notebook).
pandas_profiling.ProfileReport(all_data)

# Correlations (pairplot)

# See any important correlation in numerical values
sns.pairplot(all_data[numvars])
plt.show()

# NOTE: an unfinished cell containing only `all_data[]` followed here. An
# empty subscript is a SyntaxError that prevents the whole file from being
# parsed, so it has been removed.

# Numeric values
# Actual correlation for numeric values: report every ordered pair of distinct
# numeric columns whose correlation lies strictly between 0.6 and 1.
pair_header = "{} vs {} :"
for cat in numvars:
    for cat2 in numvars:
        a = all_data[cat].corr(all_data[cat2])
        if cat == cat2 or not (0.6 < a < 1):
            continue
        print(pair_header.format(cat, cat2))
        print(a)

# Load against shrinkage, with both colour and marker size driven by the
# store surface.
sns.relplot(
    data=all_data,
    x='cant_carga',
    y="cant_merma",
    hue='superficie_bodega',
    size="superficie_bodega",
)

# Split each "h:m:s" schedule string into three float columns named
# "<column> (hour)", "<column> (minute)" and "<column> (seconds)".
for schedule_column in (
    "hora_apertura_semana",
    "hora_cierre_semana",
    "hora_apertura_fin_de_semana",
    "hora_cierre_fin_de_semana",
):
    part_columns = [
        f"{schedule_column} ({unit})" for unit in ("hour", "minute", "seconds")
    ]
    time_parts = all_data[schedule_column].str.split(':', expand=True)
    all_data[part_columns] = time_parts.astype(float)

# Count of stores per weekday opening hour, then per weekday closing hour.
for hour_column in ('hora_apertura_semana (hour)', 'hora_cierre_semana (hour)'):
    g = sns.catplot(data=all_data, x=hour_column, kind="count")

# Categorical values
# Notebook display: correlation between selling vegetables and SKU count.
all_data["venta_verduras"].corr(all_data["cant_sku"])

# One count plot per column, split by zone: first the categorical columns,
# then the numeric ones, in their original order.
for cat in [*catvars, *numvars]:
    ax = sns.catplot(
        data=all_data,
        x=cat,
        hue="zone",
        kind="count",
        palette="rocket",
    )
    ax.set_xticklabels(rotation=45)
    plt.show()

# Notebook display: number of rows per zone.
all_data["zone"].value_counts()

# Average betweenness centrality against cold-chain availability.
sns.relplot(data=all_data, x="cadena_frio", y='betweenness_centrality_avg')